setwd("/home/armand/Desktop/phylophile_writeup/benchmarks/lanl_POL_CDS/selenium")
infile <- read.csv("timeFile.csv", header = FALSE, stringsAsFactors = FALSE)
library(reshape2)
library(ggplot2)
library(gridExtra)
colnames(infile) <- c("N_in", "process", "time")
#connvert N_in to actual values
infile$N_in <- as.integer(substr(infile$N_in, 19, length(infile$N_in)-14))
##blast hits vs queries
#create a new df with the N_in and N_blst_hits
blastHits <- infile[infile$process=="blastHits",][,-2]
colnames(blastHits) <- c("N_in", "N_blst")
#creat on theme for ggplot to use thoughout
phyloPiTheme <- theme_bw(base_size = 14) +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
#plot and fit N_blst against N_in, force through 0
fit <- lm(blastHits$N_blst ~ blastHits$N_in-1)
coefficients(fit)
## blastHits$N_in
## 4.628026
summary(fit)
##
## Call:
## lm(formula = blastHits$N_blst ~ blastHits$N_in - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.401 -0.121 2.497 4.735 8.183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## blastHits$N_in 4.62803 0.02803 165.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.808 on 49 degrees of freedom
## Multiple R-squared: 0.9982, Adjusted R-squared: 0.9982
## F-statistic: 2.726e+04 on 1 and 49 DF, p-value: < 2.2e-16
nBlstPlot <- ggplot(blastHits, aes(x = N_in, y = N_blst)) +
geom_point(shape = 1) +
geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) +
labs(x="Number of input sequences", y="Number of sequences retrieved by BLAST") +
annotate("text", x=41, y=72, label = "y == 4.628 * x", parse=T)+
annotate("text", x=40, y=60, label = "R^2 == 0.998", parse=T) +
phyloPiTheme
nBlstPlot

ggsave(file="blast hits vs queries.svg", plot=nBlstPlot)
## Saving 7 x 5 in image
#parse the dataframe for blast
blast <- infile[infile$process=="blast",][-2]
blast <- cbind(blastHits, blast)[-3]
fitBlastT <- lm(blast$time ~ blast$N_in-1)
coefficients(fitBlastT)
## blast$N_in
## 11.02104
summary(fitBlastT)
##
## Call:
## lm(formula = blast$time ~ blast$N_in - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7791 -1.3727 -0.4803 0.5859 4.2044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## blast$N_in 11.021038 0.008848 1246 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.833 on 49 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.552e+06 on 1 and 49 DF, p-value: < 2.2e-16
blastT <- ggplot(blast, aes(x = N_in, y = time)) +
geom_point(shape = 1) +
geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) +
labs(x="Number of input sequences", y="time (s)") +
annotate("text", x=40, y=200, label = "y == 11.02 * x", parse=T)+
annotate("text", x=40, y=170, label = "R^2 == 1", parse=T) +
phyloPiTheme
blastT

ggsave(file="blast time.svg", plot = blastT)
## Saving 7 x 5 in image
#parse the dataframe for mafft
mafft <- infile[infile$process=="mafftTime",][-2]
mafft <- cbind(blastHits, mafft)[-3]
#plot and fit time against N-blst, mafft
t <- mafft$time
N <- mafft$N_blst
fit <- nls(t~b * N**a, start = list(a=2,b=0.1))
cor(t,predict(fit))
## [1] 0.9993305
summary(fit)
##
## Formula: t ~ b * N^a
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 1.91841 0.01723 111.3 < 2e-16 ***
## b 0.15319 0.01380 11.1 7.5e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.09 on 48 degrees of freedom
##
## Number of iterations to convergence: 7
## Achieved convergence tolerance: 2.583e-08
mafftT <- ggplot(mafft, aes(x=N_blst, y=time))+
geom_point(shape = 1) +
geom_smooth(method="nls",
formula = y ~ b * x**a,
method.args=list(start = c(a = 2, b = 0.1)),
se = FALSE, colour='black', size = 0.25) +
labs(x="Number of sequences in alignment", y="time (s)") +
annotate("text", x=190, y=1800, label = "y == 0.153 * x^1.92", parse=T)+
phyloPiTheme
mafftT

ggsave(file="mafftTime.svg", plot=mafftT)
## Saving 7 x 5 in image
y <- mafft$time
x <- mafft$N_blst
fitMafft <- nls(y ~ b * x**a, start = list(a = 2, b = 0.1))
plot(x=mafft$N_blst, y=mafft$time)
lines(x, predict(fitMafft))

plot(x, resid(fitMafft))
abline(h=0)

qqnorm(resid(fitMafft))
qqline(resid(fitMafft))

#plot for fastTree
#for number of queries in input
fastTreeNq <- infile[infile$process == "fasttreeTime",][-2]
#add blastHits here
fastTreeWithBlstHits <- cbind(fastTreeNq,blastHits)[-3]
##an of time vs input query
fitFastTreeNq <- lm(fastTreeNq$time ~ blast$N_in-1)
coefficients(fitFastTreeNq)
## blast$N_in
## 3.021342
summary(fitFastTreeNq)
##
## Call:
## lm(formula = fastTreeNq$time ~ blast$N_in - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.308 -6.781 -2.839 2.161 19.742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## blast$N_in 3.02134 0.03424 88.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.094 on 49 degrees of freedom
## Multiple R-squared: 0.9937, Adjusted R-squared: 0.9936
## F-statistic: 7786 on 1 and 49 DF, p-value: < 2.2e-16
fastTreeNqP <- ggplot(fastTreeNq, aes(x = N_in, y = time)) +
geom_point(shape = 1) +
geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) +
labs(x="Number of query sequences", y="time (s)") +
annotate("text", x=40, y=70, label = "y == 3.02 * x", parse=T)+
annotate("text", x=40, y=60, label = "R^2 == .994", parse=T)+
phyloPiTheme
fastTreeNqP

ggsave(file="fastTree query sequences.svg", plot = fastTreeNqP)
## Saving 7 x 5 in image
##an of time vs blast results, thus number of samples in analysis
fitFastTreeWithBlstHits <- lm(fastTreeWithBlstHits$time ~ fastTreeWithBlstHits$N_blst -1)
coefficients(fitFastTreeWithBlstHits)
## fastTreeWithBlstHits$N_blst
## 0.6520651
summary(fitFastTreeWithBlstHits)
##
## Call:
## lm(formula = fastTreeWithBlstHits$time ~ fastTreeWithBlstHits$N_blst -
## 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.847 -6.985 -3.594 0.281 17.291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## fastTreeWithBlstHits$N_blst 0.652065 0.007718 84.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.407 on 49 degrees of freedom
## Multiple R-squared: 0.9932, Adjusted R-squared: 0.993
## F-statistic: 7139 on 1 and 49 DF, p-value: < 2.2e-16
fastTreeWithBlastHitsP <- ggplot(fastTreeWithBlstHits, aes(x = N_blst, y = time)) +
geom_point(shape = 1) +
geom_smooth(method=lm, se = FALSE, colour='black',formula=y~x-1, size = 0.25) +
labs(x="Number of sequences in alingment", y="time (s)") +
annotate("text", x=170, y=60, label = "y == 0.652 * x", parse=T)+
annotate("text", x=170, y=50, label = "R^2 == .993", parse=T)+
phyloPiTheme
fastTreeWithBlastHitsP

ggsave(file="fastTree total sequences.svg", plot = fastTreeWithBlastHitsP)
## Saving 7 x 5 in image
figure <- grid.arrange(nBlstPlot, blastT, mafftT, fastTreeWithBlastHitsP, nrow = 2, ncol = 2)

figure
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
ggsave(file = "benchFig.svg",plot = figure, width = 7.66, height = 10)